clear
set more off

* Set working folder to main folder

import delimited using input\CUSP\patents_fyear_iyear_from_1900.csv, clear

keep if fyear >= 1916 & fyear <= 1920

keep patnum fyear fmonth 

drop if fyear == . | fmonth == . 

save "input\migration_of_inventors\temp.dta", replace

import delimited using input\CUSP\help_files\unique_ids\patents_inventor_name_location_id_90.csv, clear

gen aaa1 = substr(id,1,1)
encode aaa1, gen(bbb1)
gen aaa2 = substr(id,2,.)
destring aaa2, gen(bbb2)
gen inventor_id = 1000000*bbb1 + bbb2
drop aaa* bbb*

keep patnum inv_name inv_city inv_state inventor_id

duplicates drop patnum inv_name, force

replace inv_name = lower(inv_name)
replace inv_city = lower(inv_city)
replace inv_state = lower(inv_state)

bys patnum: gen fraction_of_patent = 1/_N

merge m:1 patnum using "input\migration_of_inventors\temp.dta", keep(match) nogen

gen fyear_fmonth = 100*fyear + fmonth

save "input\migration_of_inventors\temp1.dta", replace

* Import dataset with birthplace information:
import delimited using input\inventors_census_matches\inventors_age_bpl.csv, clear

keep patnum inv_name weight bpl

merge m:1 patnum inv_name using "input\migration_of_inventors\temp1.dta", keep(match) nogen

save "input\migration_of_inventors\temp2.dta", replace

*****************************************

* Now, for each patent-inventor, compute "share" born in the US (share of weights) - probabilistic approach

use "input\migration_of_inventors\temp2.dta", clear

gen usborn = 0
replace usborn = 1 if bpl < 100 & bpl != .

bys patnum inv_name: egen weight_usborn = sum(weight*usborn)

drop weight bpl usborn
duplicates drop patnum inv_name, force

drop if weight_usborn == 0

save "input\migration_of_inventors\temp6.dta", replace

*****************************************

*** Generate dataset of patents filed only by inventors born in the US:

*** Now, we want to:

** 1) Match inv_city with one of the 50 NPI cities, and keep only the ones with an NPI city:

* First, the 43 cities:
import delimited using input\CUSP\inv_patents_assigned_to_npi_cities.csv, clear

replace inv_name = lower(inv_name)
replace inv_city = lower(inv_city)
replace inv_state = lower(inv_state)

keep patnum inv_name npi_id

duplicates drop patnum inv_name, force

merge 1:1 patnum inv_name using "input\migration_of_inventors\temp6.dta", keep(match) nogen
save "input\migration_of_inventors\temp3.dta", replace

* Then, the 7 extra cities:
import delimited using input\CUSP\inv_patents_assigned_to_cls_cities.csv, clear

replace inv_name = lower(inv_name)
replace inv_city = lower(inv_city)
replace inv_state = lower(inv_state)

keep patnum inv_name cls_id

duplicates drop patnum inv_name, force

merge 1:1 patnum inv_name using "input\migration_of_inventors\temp6.dta", keep(match) nogen

append using "input\migration_of_inventors\temp3.dta"
* NOTE: Here there are duplicates (patents with npi_id also have a cls_id). But this will be fixed in the next step
save "input\migration_of_inventors\temp3.dta", replace

* Now, put everything together:

use "output\cities_expandedpatents.dta", clear
keep if gen_id!=. & npi_id!=.
keep gen_id npi_id city
duplicates drop
save "input\migration_of_inventors\temp4a.dta", replace

use "output\cities_expandedpatents.dta", clear
keep if gen_id!=. & cls_id!=.
keep gen_id cls_id city
duplicates drop
save "input\migration_of_inventors\temp4b.dta", replace

use "input\migration_of_inventors\temp3.dta", clear
merge m:1 npi_id using "input\migration_of_inventors\temp4a.dta", keep(match) nogen
save "input\migration_of_inventors\temp5.dta", replace

use "input\migration_of_inventors\temp3.dta", clear
merge m:1 cls_id using "input\migration_of_inventors\temp4b.dta", keep(match) nogen
append using "input\migration_of_inventors\temp5.dta"
save "input\migration_of_inventors\temp5.dta", replace

** 2) Count patents by npi-city:

bys fyear fmonth gen_id: egen totpat_usborn = sum(fraction_of_patent*weight_usborn)

keep fyear fmonth totpat city gen_id

duplicates drop

sort gen_id fyear fmonth

export delimited using input\migration_of_inventors\usborn_city_totpat_fyear_fmonth.csv, replace

* Erase temporary files:

erase input\migration_of_inventors\temp.dta
erase input\migration_of_inventors\temp1.dta
erase input\migration_of_inventors\temp2.dta
erase input\migration_of_inventors\temp3.dta 
erase input\migration_of_inventors\temp4a.dta 
erase input\migration_of_inventors\temp4b.dta 
erase input\migration_of_inventors\temp5.dta
erase input\migration_of_inventors\temp6.dta
